## The Role of Case Management in Misdemeanor Prosecution
## Lindsay Graef and Aurelie Ouss
##
## TABLES AND FIGURES
## This script creates all tables for the paper and figures A1 through A3

library(daotools)
library(lubridate)
library(tidyr)
library(stringr)
library(ggplot2)
library(tidytable) # load tidytable before tidyverse
library(tidyverse)
library(kableExtra)
library(fixest)
library(modelsummary) # version 1.4.3  # install.packages("https://cran.r-project.org/src/contrib/Archive/modelsummary/modelsummary_1.4.3.tar.gz", repos = NULL, type = "source")
library(janitor)
library(fastDummies)
library(conflicted)
library(readtext)
library(readxl)
library(NIBRSFunction)
library(patchwork)
library(extrafont)

# Set conflict resolutions
conflicts_prefer(tidytable::filter(),
                 tidytable::arrange(),
                 tidytable::mutate(),
                 tidytable::rename(),
                 tidytable::left_join(),
                 tidytable::slice(),
                 tidytable::distinct(),
                 tidytable::summarize(),
                 tidytable::select(),
                 tidytable::case_when(),
                 tidytable::ifelse(),
                 tidytable::pivot_longer(),
                 tidytable::pivot_wider(),
                 tidytable::bind_rows(),
                 tidytable::`%in%`
)

# set seed for replication
set.seed(52285)

## FINAL COLOR PALETTE:
# "#94D2BD" : light green
# "#0A9396" : teal
# "#005F73" : dark blue

#-------------------------------------------------------------------------------#
# FUNCTIONS --------------------------------------------------------------------#
#-------------------------------------------------------------------------------#

# Get up / down charging decisions
# Pulls in up / down charging information and merges onto main dataframe;
# used for balance tests.
getUpDownCharges <- function(df){
  
  # get simple up / down decisions 
  updown <- readRDS("/srv/data/penn/benchmark/updown_charging_charges_clean_expanded.rds") %>%
    select(arrest_id, upgrade_or_downgrade)
  
  # link to dc-pids and summarize by dc-pid before merging onto main df
  arrests <- load_arrests() %>%
    select(arrest_id, dc_number, defendant_pid) %>%
    mutate(dc_pid = paste0(dc_number,";",defendant_pid)) %>%
    left_join(., updown, by = "arrest_id") %>%
    mutate(upcharge = ifelse(any(upgrade_or_downgrade == "Upgrade"), 1, 0), .by = dc_pid) %>%
    mutate(downcharge = ifelse(any(upgrade_or_downgrade == "Downgrade"), 1, 0), .by = dc_pid) %>%
    distinct(dc_pid, .keep_all = T) %>%
    select(dc_pid, upcharge, downcharge)
  
  df <- df %>%
    left_join(., arrests, by = "dc_pid")
  
  return(df)
}

## Get Arrest Narrative Lengths
# get character and word count of arrest narratives for balance tests
# for each dc-pid, takes the max char and word count, since sometimes
# affidavits go through several versions before warrant is executed-- we take
# the longest one.
getArrestNarrativeLength <- function(df){
  
  narratives <- readRDS("/srv/data/penn/arrest_narratives_nlp/arrest_and_affidavit_narratives.rds") %>%
    mutate(facts_all = paste(facts_description, affidavit_facts_description)) %>%
    mutate(char_count = nchar(facts_all),
           word_count = sapply(strsplit(facts_all, "\\s+"), length)) %>%
    select(arrest_id, facts_all, char_count, word_count)
  
  # link to dc-pids and summarize by dc-pid before merging onto main df
  arrests <- load_arrests() %>%
    select(arrest_id, dc_number, defendant_pid) %>%
    mutate(dc_pid = paste0(dc_number,";",defendant_pid)) %>%
    left_join(., narratives, by = "arrest_id") %>%
    mutate(char_count = max(char_count), .by = dc_pid) %>%
    mutate(word_count = max(word_count), .by = dc_pid) %>%
    distinct(dc_pid, .keep_all = T) %>%
    select(dc_pid, char_count, word_count)
  
  df <- df %>%
    left_join(., arrests, by = "dc_pid")
  
  return(df)
}


## Clean PA Statute variables
# separates out title, section, and subsection into new variables before using NIBRS function
cleanStatutes <- function(df){
  
  df$statute_title <- str_extract(df$lead_charge_offense_code_with_subsection,"^[0-9]+")
  df$statute_section <- str_extract(df$lead_charge_offense_code_with_subsection,"[^0-9] [0-9]+-?.?([0-9]+)?") 
  df$statute_subsection <- str_extract(df$lead_charge_offense_code_with_subsection,"[^0-9]([A-Z]*[0-9]*[0-9A-Z]?)$") 
  # remove the squigglies, remove "_" at end of some sections
  df$statute_subsection <- str_extract(df$statute_subsection, "([A-Z]*[0-9]*[0-9A-Z]?)$")
  df$statute_section <- gsub("[^0-9] ([0-9]+-?.?([0-9]+)?)","\\1",df$statute_section) 
  df$statute_section <- gsub("_$","",df$statute_section) 
  
  return(df)
}


## Get Crime Categories
# Creates variable categorizing offenses by NIBRS and UCR codes
getCrimeCat <- function(df){
  
  df <- df %>%
    mutate(crimebigcat = case_when(
      grepl("Driving Under", nibrs_desc)|grepl("Driving Under", ucr_desc) ~ "DUI",
      grepl("Assault|Weapon|Sex|Manslaughter|Human", ucr_desc)|
        grepl("Kidnapping|Reckless Endangerment|Rape|Robbery|Sexual Assault", nibrs_desc)|
        grepl("Make Repairs/Sell/Etc Offens Weap|Poss Instrument Of Crime W/Int|Corpse|Bomb Threat|Propel Missile Into Occ Vehicles|Poss. Firearm In Ct. Facility For Use In Crime|Uses Incapacitation Device", lead_charge_description) ~ "Violent",
      grepl("Drug", ucr_desc)|
        grepl("Drug", nibrs_desc)|
        # note: many "controlled substance" are DUIs; important to keep DUI cat at top of code
        grepl("Controlled Substance|Disp Cont Subst|Furnishing Drug Free Urine|Know Dist Sche I Or Ii Contd Subs|Sale Give Contr Subs To Dep Person|Smell/Inhale Toxic Releasing Substances|Adult/Muti/Dest Label",lead_charge_description) ~ "Drugs",
      grepl("Disorderly|Liquor|Vagrancy|Prostitution|Drunkenness", ucr_desc)|
        grepl("Trespass|Obscene|Drunkenness", nibrs_desc)|
        grepl("Corruption Of Minors|Deposit Trash On Street|Dogs Not Validly Registered|Intentional Desecration Of Public Monument|Operating Watercraft|Scatter Rubbish|Tattooing Minor", lead_charge_description) ~ "Public Order",
      grepl("Stolen|theft|Theft|Vandalism|Fraud|Forgery", ucr_desc)|
        grepl("Bad Checks|Contraband|Burglary|Arson", nibrs_desc)|
        grepl("Conceal Info Affecting Benefits|Conspiracy|Unauthorized Sale/Transfer Of Tickets|Unlawful Use Recording Device In Theater|Conceal Info Affecting Benefits|Recorde?d? Device", lead_charge_description) ~ "Property",
      TRUE ~ "Other"))
  
  return(df)
}


## Get Recidivism Details
# gets dataset with crime types for first rearrest for defendants / cases 
# in our main analysis dataset
getRecidDetails <- function(df_team){
  
  # get first arrest for each defendant in our dataset
  recid <- df_team %>%
    filter(rearrest_win_2yr_fromOGopen == 1) %>%
    distinct(docket_number, .keep_all = T) %>%
    arrange(defendant_pid, case_open_date) %>%
    select(defendant_pid, docket_number, case_open_date, crimebigcat) %>%
    slice(1, .by = defendant_pid)
  
  # now go find the rearrests for each defendant
  cases <- load_cases() %>%
    # just get info for defendants we need
    filter(defendant_pid %in% recid$defendant_pid) %>%
    cleanStatutes() %>%
    get_nibrs_ucr_cats(as.data.frame(.),
                       title_var = statute_title,
                       section_var = statute_section,
                       subsection_var = statute_subsection,
                       desc_var = lead_charge_description,
                       add_flag = FALSE) %>%
    getCrimeCat(.) %>%
    select(defendant_pid, docket_number, case_open_date, crimebigcat) %>%
    filter(!docket_number %in% recid$docket_number)
  
  recid <- recid %>%
    rbind(., cases) %>%
    arrange(defendant_pid, case_open_date) %>%
    mutate(arrest_number = row_number(), .by = defendant_pid) %>%
    # indicator for cases in our main sample
    mutate(case_in_sample = ifelse(docket_number %in% df_team$docket_number, 1, 0)) %>%
    # keep the first rearrest by defendant after cases in our sample
    filter(dplyr::lag(case_in_sample == 1), .by = defendant_pid)
  
  return(recid)
}

## Get ADA IDs
# creates random but consecutive numeric, anonymized ID numbers by ADA name
getADAIDs <- function(df) {
  
  df <- df %>%
    distinct(ada_name) %>%
    mutate(ada_ID = sample(n())) %>%
    arrange(ada_ID) %>%
    mutate(ada_ID = row_number())
  
  return(df)
}

## Get Main Sample
# loads our main dataset that was used for benchmarking
getMainSample <- function(){
  
  # Get hearing-level dataset with teams of all MC ADAs to touch the case
  df_team <- readRDS("/srv/data/penn/what_makes_an_effective_prosecutor/final_benchmark_df.rds") %>%
    # Keep only listings in MC room
    filter(court_room_num %in% c("403","406","503","506","603","606","703","706","803","806","903","906","1103")) %>%
    # keep hearings with at least one ADA identified 
    filter(!is.na(ada_name)) %>%
    # keep hearings for ADAs with caseloads > 50
    mutate(caseload = n(), .by = ada_name) %>%
    filter(caseload >= 50) %>%
    # drop levels of factor ADA names that don't exist after filtering
    droplevels(.) %>%
    # create anonymized ADA ID numbers, then join back to main dataset
    left_join(., getADAIDs(.), by = "ada_name") %>%
    # need key for unique docket-listing-adas 
    mutate(docket_listing_ada = paste(docket_number, listing_number, ada_number)) %>%
    # drop if outcomes are missing (drops one case) 
    filter(!is.na(recid_win_2yr_fromOGopen)) %>%
    mutate(convicted_acquitted = ifelse(conviction == 1 | acquitted == 1, 1, 0)) %>%
    mutate(dismissed = ifelse(dispo_type_update %in% c("404 Withdrawn","Dismissed/Withdrawn/Etc"), 1, 0))
  
  # drop a few hearings with unclear / conflicting court and judge information (need distinct ADA-docket-listings)
  i <- which(duplicated(df_team$docket_listing_ada))
  a <- df_team[i,]
  df_team <- df_team %>% 
    filter(!docket_listing_ada %in% a$docket_listing_ada)
  
  return(df_team)
}

#-------------------------------------------------------------------------------#
# LOAD FINAL DATASET AND BENCHMARKING RESULTS ----------------------------------#
#-------------------------------------------------------------------------------#

# load main dataset
df_team <- getMainSample()

# load benchmarking results
results_team <- read_csv("/srv/data/penn/what_makes_an_effective_prosecutor/results_team.csv")
results_robust <- read_csv("/srv/data/penn/what_makes_an_effective_prosecutor/results_robust.csv")

#-------------------------------------------------------------------------------#
# MAKE TABLES ------------------------------------------------------------------#
#-------------------------------------------------------------------------------#

## Get Descriptive Statistics Table
# takes dataset of all hearings and ADAs and summarizes
# to hearing, case, and defendant level descriptive statistics
getDescriptiveStatsTable <- function(df){
  
  # get case and defendant characteristics
  tab1 <- df %>%
    distinct(docket_number, .keep_all = T) %>%
    mutate(is_dv = ifelse(is_dv == TRUE, 1, 0)) %>%
    summarize(Male = round(mean(male, na.rm = T), digits = 2),
              Black = round(mean(black), digits = 2),
              Hispanic = round(mean(hisp), digits = 2),
              `Defendant Age at Arrest` = round(mean(def_age_arrest,na.rm = T), digits = 2),
              `Violent` = round(mean(violent), digits = 2),
              `Drugs` = round(mean(drug), digits = 2),
              `DUI` = round(mean(dui), digits = 2),
              `Property` = round(mean(property), digits = 2),
              `Public Order` = round(mean(public_order), digits = 2),
              `DV` = round(mean(is_dv), digits = 2),
              `Number of Current Charges` = round(mean(num_charges, na.rm = T), digits = 2),
              `Prior Case in Past Year` = round(mean(prior_past_yr), digits = 2),
              `Prior Arrest in Past Year` = round(mean(priorarrest_past_year), digits = 2),
              `Prior Misdemeanor Conviction` = round(mean(prior_misd_conviction), digits = 2),
              `Prior Felony Conviction` = round(mean(prior_fel_conviction), digits = 2),
              `Prior Violent Conviction` = round(mean(prior_violent_conviction), digits = 2),
              `Prior Incarceration` = round(mean(prior_incarceration), digits = 2),
              `Number of Prior Arrests` = round(mean(number_of_prior_arrests,na.rm = T), digits = 2),
              `Pending Case at Open` = round(mean(pending_case_at_open), digits = 2),
              `On Probation at Case Open` = round(mean(on_probation), digits = 2),
              `Bail Posted Within 3 Days` = round(mean(bail_posted_in_3days, na.rm = T), digits = 2)
    ) %>%
    tidytable::pivot_longer(.,
                            names_to = "variable",
                            values_to = "mean") 
  
  # get hearing outcomes
  tab2 <- df %>%
    distinct(id_dlc, .keep_all = T) %>%
    summarize(`CW FTA` = round(mean(cw_fta_athearing), digits = 2),
              `Return FTA` = round(mean(return_fta), digits = 2),
              `Disco Incomplete` = round(mean(disco_incomplete), digits = 2),
              `CW Not Ready` = round(mean(cw_notready_combo), digits = 2),
              `Marked MBT` = round(mean(marked_mbt), digits = 2)
    ) %>%
    tidytable::pivot_longer(.,
                            names_to = "variable",
                            values_to = "mean") 
  
  # get case and defendant outcomes
  tab3 <- df %>%
    distinct(docket_number, .keep_all = T) %>%
    summarize(`Case Length Days` = round(mean(case_length_days), digits = 2),
              Convicted = round(mean(conviction), digits = 2),
              Punished = round(mean(punished), digits = 2),
              Incarcerated = round(mean(any_incarceration), digits = 2),
              `Probation Only` = round(mean(probation_only), digits = 2),
              `Convicted with No Formal Punishment` = round(mean(convicted_no_sentence), digits = 2),
              `New Arrest within 2 Years` = round(mean(rearrest_win_2yr_fromOGopen), digits = 2),
              `New Charge within 2 Years` = round(mean(recid_win_2yr_fromOGopen), digits = 2),
              `Cases` = n()
    ) %>%
    tidytable::pivot_longer(.,
                            names_to = "variable",
                            values_to = "mean") 
  
  # add total number of hearings to the bottom of table
  tab4 <- df %>%
    distinct(id_dlc, .keep_all = T) %>%
    summarize(`Hearings` = n()) %>%
    tidytable::pivot_longer(.,
                            names_to = "variable",
                            values_to = "mean")
  
  # add total number of hearings to the bottom of table
  tab5 <- df %>%
    summarize(`ADA Hearings` = n()) %>%
    tidytable::pivot_longer(.,
                            names_to = "variable",
                            values_to = "mean")
  
  # add total number of hearings to the bottom of table
  tab6 <- df %>%
    distinct(ada_name, .keep_all = T) %>%
    summarize(`Unique ADAs` = n()) %>%
    tidytable::pivot_longer(.,
                            names_to = "variable",
                            values_to = "mean")
  
  tab <- rbind(tab1, tab2) %>%
    rbind(., tab3) %>%
    rbind(., tab4) %>%
    rbind(., tab5) %>%
    rbind(., tab6) %>%
    kable(caption = "Descriptive Statistics", booktabs = T, format = "latex") %>%
    kable_styling(latex_options = "striped") %>%
    # Put Row 2 to Row 5 into a Group and label it as "Group A"
    pack_rows("Defendant and Case Characteristics", 1, 21) %>%
    pack_rows("Hearing Outcomes", 22, 26) %>%
    pack_rows("Case Outcomes", 27, 32) %>%
    pack_rows("Defendant Outcomes", 33, 34)
  
  return(tab)
}


## Get ADA Raw Outcomes Table
# gets sample table for raw outcomes for a randomly selected ADA
getADARawOutcomesTable <- function(df_team, results_team, sampled_ada){
  
  # pull results for randomly selected ADA
  ada <- readRDS(paste0("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_hearing_team/",gsub(" ","_",sampled_ada),".rds"))
  adaID <- results_team[results_team$ADA == sampled_ada,]$ada_ID
  
  # get raw outcomes for randomly selected ADA
  ada_raw_outcomes.1 <- df_team %>%
    filter(ada_name == sampled_ada) %>%
    summarize(`CW FTA` = round(mean(cw_fta_athearing), digits = 2),
              `Return FTA` = round(mean(return_fta), digits = 2),
              `Disco Incomplete` = round(mean(disco_incomplete), digits = 2),
              `CW Not Ready` = round(mean(cw_notready_combo), digits = 2),
              `Marked MBT` = round(mean(marked_mbt), digits = 2),
              Observations = n()) %>%
    pivot_longer(.,
                 names_to = "variable",
                 values_to = "mean")
  
  ada_raw_outcomes.2 <- df_team %>%
    filter(ada_name == sampled_ada) %>%
    distinct(docket_number, .keep_all = T) %>%
    summarize(`Case Length Days` = round(mean(case_length_days), digits = 2),
              Convicted = round(mean(conviction), digits = 2),
              Punished = round(mean(punished), digits = 2),
              Incarcerated = round(mean(any_incarceration), digits = 2),
              `Probation Only` = round(mean(probation_only), digits = 2),
              `Convicted with No Formal Punishment` = round(mean(convicted_no_sentence), digits = 2),
              `New Arrest within 2 Years` = round(mean(rearrest_win_2yr_fromOGopen), digits = 2),
              `New Charge within 2 Years` = round(mean(recid_win_2yr_fromOGopen), digits = 2)) %>%
    pivot_longer(.,
                 names_to = "variable",
                 values_to = "mean")
  
  ada_raw_outcomes <- rbind(ada_raw_outcomes.1, ada_raw_outcomes.2)
  
  # get outcomes for unweighted controls
  uw_controls <- df_team %>%
    # label ADA i's cases as "treat"
    mutate(treat = ifelse(ada_name == sampled_ada, 1, 0)) %>%
    # drop rows for teammates who handled the same case (don't count towards benchmark)
    mutate(teammate = ifelse(any(treat == 1), 1, 0), .by = docket_number) %>%
    mutate(drop = ifelse(treat == 0 & teammate == 1, 1, 0)) %>%
    filter(drop == 0) %>%
    mutate(docket_listing_ada = paste(docket_number, listing_number, ada_number)) %>%
    distinct(docket_listing_ada, .keep_all = T) %>%
    filter(treat == 0) 
  
  uw_controls.1 <- uw_controls %>%
    summarize(`CW FTA` = round(mean(cw_fta_athearing), digits = 2),
              `Return FTA` = round(mean(return_fta), digits = 2),
              `Disco Incomplete` = round(mean(disco_incomplete), digits = 2),
              `CW Not Ready` = round(mean(cw_notready_combo), digits = 2),
              `Marked MBT` = round(mean(marked_mbt), digits = 2),
              Observations = n()) %>%
    pivot_longer(.,
                 names_to = "variable",
                 values_to = "mean") %>%
    rename(`Unweighted Controls` = mean)
  
  uw_controls.2 <- uw_controls %>%
    distinct(docket_number, .keep_all = T) %>%
    summarize(`Case Length Days` = round(mean(case_length_days), digits = 2),
              Convicted = round(mean(conviction), digits = 2),
              Punished = round(mean(punished), digits = 2),
              Incarcerated = round(mean(any_incarceration), digits = 2),
              `Probation Only` = round(mean(probation_only), digits = 2),
              `Convicted with No Formal Punishment` = round(mean(convicted_no_sentence), digits = 2),
              `New Arrest within 2 Years` = round(mean(rearrest_win_2yr_fromOGopen), digits = 2),
              `New Charge within 2 Years` = round(mean(recid_win_2yr_fromOGopen), digits = 2)) %>%
    pivot_longer(.,
                 names_to = "variable",
                 values_to = "mean") %>%
    rename(`Unweighted Controls` = mean)
  
  uw_controls <- rbind(uw_controls.1, uw_controls.2)
  
  # pull out dataframe of benchmarking weights
  temp <- as.data.frame(ada$w)
  temp$docket_listing_ada <- rownames(temp)
  rownames(temp) <- NULL
  names(temp) <- c("weight","docket_listing_ada")
  
  # get outcomes for weighted controls
  w_controls <- df_team %>%
    # label ADA i's cases as "treat"
    mutate(treat = ifelse(ada_name == sampled_ada, 1, 0)) %>%
    # drop rows for teammates who handled the same case (don't count towards benchmark)
    mutate(teammate = ifelse(any(treat == 1), 1, 0), .by = docket_number) %>%
    mutate(drop = ifelse(treat == 0 & teammate == 1, 1, 0)) %>%
    filter(drop == 0) %>%
    filter(treat == 0) %>%
    mutate(docket_listing_ada = paste(docket_number, listing_number, ada_number)) %>%
    distinct(docket_listing_ada, .keep_all = T) %>%
    # merge in weights from benchmarking
    left_join(., temp) %>%
    filter(!is.na(weight)) 
  
  w_controls.1 <- w_controls %>%
    summarize(`CW FTA` = round(weighted.mean(cw_fta_athearing, weight), digits = 2),
              `Return FTA` = round(weighted.mean(return_fta, weight), digits = 2),
              `Disco Incomplete` = round(weighted.mean(disco_incomplete, weight), digits = 2),
              `CW Not Ready` = round(weighted.mean(cw_notready_combo, weight), digits = 2),
              `Marked MBT` = round(weighted.mean(marked_mbt, weight), digits = 2),
              Observations = ada$ESS) %>%
    pivot_longer(.,
                 names_to = "variable",
                 values_to = "mean") %>%
    rename(`Weighted Controls` = mean) 
  
  w_controls.2 <- w_controls %>%
    distinct(docket_number, .keep_all = T) %>%
    summarize(`Case Length Days` = round(weighted.mean(case_length_days, weight), digits = 2),
              Convicted = round(weighted.mean(conviction, weight), digits = 2),
              Punished = round(weighted.mean(punished, weight), digits = 2),
              Incarcerated = round(weighted.mean(any_incarceration, weight), digits = 2),
              `Probation Only` = round(weighted.mean(probation_only, weight), digits = 2),
              `Convicted with No Formal Punishment` = round(weighted.mean(convicted_no_sentence, weight), digits = 2),
              `New Arrest within 2 Years` = round(weighted.mean(rearrest_win_2yr_fromOGopen, weight), digits = 2),
              `New Charge within 2 Years` = round(weighted.mean(recid_win_2yr_fromOGopen, weight), digits = 2)) %>%
    pivot_longer(.,
                 names_to = "variable",
                 values_to = "mean") %>%
    rename(`Weighted Controls` = mean) 
  
  w_controls <- rbind(w_controls.1, w_controls.2)
  
  tab <- ada_raw_outcomes %>%
    left_join(., uw_controls) %>%
    left_join(., w_controls) %>%
    mutate(across(where(is.numeric), round, digits = 2)) %>%
    # move observations row to end
    arrange(dplyr::if_else(variable == "Observations", 1, 0)) %>%
    kable(caption = paste0("Raw Outcomes for ADA ",adaID), booktabs = T, format = "latex") %>%
    kable_styling(latex_options = "striped") %>%
    pack_rows("ADA Skills / Hearing Outcomes", 1, 5) %>%
    pack_rows("Case Outcomes", 6, 11) %>%
    pack_rows("Defendant Outcomes", 12, 13) 
  
  return(tab)
}


## Get ADA Caseload Balance Table
# gets sample table showing caseload balance for a randomly selected ADA
getCaseloadBalanceTable <- function(df_team, results_team, sampled_ada){
  
  # pull results for randomly selected ADA
  ada <- readRDS(paste0("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_hearing_team/",gsub(" ","_",sampled_ada),".rds"))
  adaID <- results_team[results_team$ADA == sampled_ada,]$ada_ID
  treat_label <- paste0("ADA ",adaID)
  
  # get balance tab from fastDR output (includes treated KS and weighted control KS)
  balance_tab <- ada$balance.tab %>%
    as.data.frame(.) %>%
    rownames_to_column(.) %>%
    mutate(rowname = gsub("lead_charge_offense_code_with_subsection:","",rowname)) %>%
    mutate(rowname = gsub(":","_",rowname)) %>%
    mutate(across(where(is.numeric), round, digits = 4)) %>%
    select(rowname, treatment, control, KS) %>%
    rename(variable = rowname,
           `Weighted Controls` = control)
  
  # get unweighted controls 
  # need full dataset used in benchmarking first
  df <- df_team %>%
    # label ADA i's cases as "treat"
    mutate(treat = ifelse(ada_name == sampled_ada, 1, 0)) %>%
    # drop rows for teammates who handled the same case (didn't count towards benchmark)
    mutate(teammate = ifelse(any(treat == 1), 1, 0), .by = docket_number) %>%
    mutate(drop = ifelse(treat == 0 & teammate == 1, 1, 0)) %>%
    filter(drop == 0) %>%
    dummy_cols(., select_columns = "lead_charge_offense_code_with_subsection") %>%
    dummy_cols(., select_columns = "lead_charge_grade_atCU_levels") %>%
    dummy_cols(., select_columns = "dc_district") %>%
    dummy_cols(., select_columns = "year") %>%
    dummy_cols(., select_columns = "is_dv") %>%
    dummy_cols(., select_columns = "judge") %>%
    mutate(docket_listing_ada = paste(docket_number, listing_number, ada_number))
  
  names(df) <- gsub("lead_charge_offense_code_with_subsection_","",names(df))
  
  # get unweighted controls
  unweighted_controls <- df %>%
    filter(treat == 0) %>%
    distinct(docket_listing_ada, .keep_all = T) %>%
    select(def_age_arrest,male,black,hisp,violent,drug,property,dui,public_order,
           starts_with("is_dv_"),starts_with("lead_charge_grade_atCU_levels_"),number_of_charges,has_pic_dao,has_conspiracy_dao,has_vufa_dao,
           has_resisting_dao,pending_case_at_open,number_of_prior_arrests,on_probation,
           bail_posted_in_3days,starts_with("prior_"),starts_with("year_"),starts_with("dc_district_"),
           def_atty_pd, starts_with("judge_"),matches("PaCS")) %>%
    summarize_all(., mean, na.rm = T) %>%
    mutate(across(where(is.numeric), round, digits = 4)) %>%
    pivot_longer(.,
                 cols = everything(),
                 names_to = "variable",
                 values_to = "mean") %>%
    filter(variable %in% balance_tab$variable) %>%
    rename(`Unweighted Controls` = mean) 
  
  tab <- left_join(unweighted_controls, balance_tab) %>%
    mutate(variable = gsub("_"," ",variable)) %>%
    mutate(variable = str_to_title(variable)) %>%
    select(variable, treatment, "Unweighted Controls",
           "Weighted Controls","KS")
  
  # Add n, ESS, max KS to last row of table (need to edit labeling slightly in LaTeX)
  observations <- data.frame(variable = "Observations",
                             treatment = ada$n1,
                             `Unweighted Controls` = nrow(df[treat == 0,]),
                             `Weighted Controls` = ada$ESS,
                             KS = round(ada$ks, digits = 4))
  
  # Customize labels by treatment variable
  tab <- tab %>% rename(!!treat_label := treatment)
  names(observations) <- c("variable", treat_label, "Unweighted Controls",
                           "Weighted Controls","KS")
  
  tab <- rbind(tab, observations)
  
  tab <- tab %>%
    mutate(across(where(is.numeric), round, digits = 2)) %>%
    kable(caption = "Caseload Balance", booktabs = T, format = "latex") %>%
    kable_styling(latex_options = "striped") %>%
    # Pack rows into groups and label
    pack_rows("Defendant and Case Characteristics", 1, 29) %>%
    pack_rows("Years", 30, 34) %>%
    pack_rows("Police Districts", 35, 50) %>%
    pack_rows("Workgroup Actors", 51, 81) %>%
    pack_rows("Specific Charges", 82, 129)
  
  return(tab)
}


## Get example of ADA-level dataset of z-scores
getSampleADAZScores <- function(results_team, sampled_ada){
  
  # randomly sample 5 ADAs (including the one we pulled earlier)
  sampled_ada_row <- results_team[results_team$ADA == sampled_ada, ]
  remaining_pool <- results_team[results_team$ADA != sampled_ada, ]
  additional_adas <- remaining_pool[sample(nrow(remaining_pool), 4), ]
  combined_adas <- rbind(sampled_ada_row, additional_adas)
  
  tab <- results_team %>%
    filter(ada_ID %in% combined_adas$ada_ID) %>%
    select(ada_ID, ends_with("z_norm")) %>%
    select(ada_ID,cw_fta_athearing_z_norm,disco_incomplete_z_norm,case_length_days_z_norm,
           conviction_z_norm,punished_z_norm,rearrest_win_2yr_fromOGopen_z_norm,
           recid_win_2yr_fromOGopen_z_norm) %>%
    rename("CW FTA at Hearing" = cw_fta_athearing_z_norm,
           "Disco Incomplete" = disco_incomplete_z_norm,         
           "Case Length Days" = case_length_days_z_norm,                
           "Conviction" = conviction_z_norm,
           "Formal Punishment" = punished_z_norm,
           "New Arrest" = rearrest_win_2yr_fromOGopen_z_norm,
           "New Case" = recid_win_2yr_fromOGopen_z_norm) %>%
    mutate(across(where(is.numeric), round, digits = 3)) %>%
    kable(caption = "Prosecutor z-scores", booktabs = T, format = "latex") %>%
    kable_styling(latex_options = "striped") %>%
    add_header_above(c(" " = 1, "Hearing Outcomes" = 2, "Case Outcomes" = 3, "Public Safety Outcomes" = 2))
  
  return(tab)
}


## Get Hearing-Case Outcome Regressions
# Gets regression of case outcomes on hearing outcomes
# Options for main results or results using adjusted z-scores
getHCRegsTab <- function(df, use_adj = FALSE) {
  
  # Variable selection based on the 'use_adj' flag
  suffix <- if (use_adj) ".adj_z" else "_z_norm"
  
  lm_length <- lm(as.formula(paste0("case_length_days", suffix, " ~ cw_notready_combo", suffix)), data = df)
  lm_conv <- lm(as.formula(paste0("conviction", suffix, " ~ cw_notready_combo", suffix)), data = df)
  lm_pun <- lm(as.formula(paste0("punished", suffix, " ~ cw_notready_combo", suffix)), data = df)
  
  tab <- modelsummary(list("Case Length" = lm_length,
                           "Conviction" = lm_conv,
                           "Formal Punishment" = lm_pun),
                      stars = TRUE,
                      title = "Association between ADA Hearing and Case Outcomes",
                      gof_map = c("nobs", "r.squared"),
                      output = "latex",
                      booktabs = TRUE) %>%
    kable_styling(latex_options = c("striped", "hold_position"),
                  position = "center", full_width = FALSE) %>%
    add_header_above(c(" " = 1, "Dependent Variable" = 3))
  
  return(tab)
}

## Get Case-Defendant Outcome Regressions
# Gets regression of public safety outcomes on defendant outcomes
# Options for main results or results using adjusted z-scores
getCDRegsTab <- function(df, use_adj = FALSE) {
  
  # Variable suffix based on the 'use_adj' flag
  suffix <- if (use_adj) ".adj_z" else "_z_norm"
  
  lm_rearrest1 <- lm(as.formula(paste0("rearrest_win_2yr_fromOGopen", suffix, " ~ conviction", suffix, " + case_length_days", suffix)), data = df)
  lm_rearrest2 <- lm(as.formula(paste0("rearrest_win_2yr_fromOGopen", suffix, " ~ punished", suffix, " + case_length_days", suffix)), data = df)
  lm_recid1 <- lm(as.formula(paste0("recid_win_2yr_fromOGopen", suffix, " ~ conviction", suffix, " + case_length_days", suffix)), data = df)
  lm_recid2 <- lm(as.formula(paste0("recid_win_2yr_fromOGopen", suffix, " ~ punished", suffix, " + case_length_days", suffix)), data = df)
  
  if(use_adj){
    coef_mapping <- c("conviction.adj_z" = "Conviction",
                      "punished.adj_z" = "Punished",
                      "case_length_days.adj_z" = "Case Length (Days)")
  }else{
    coef_mapping <- c("conviction_z_norm" = "Conviction",
                      "punished_z_norm" = "Formal Punishment",
                      "case_length_days_z_norm" = "Case Length (Days)")
  }
  
  tab <- modelsummary(list("(1)" = lm_rearrest1,
                           "(2)" = lm_rearrest2,
                           "(3)" = lm_recid1,
                           "(4)" = lm_recid2),
                      coef_map = coef_mapping,
                      stars = TRUE,
                      title = "Association between Case and Defendant Outcomes",
                      gof_map = c("nobs", "r.squared"),
                      output = "latex",
                      booktabs = TRUE) %>%
    kable_styling(latex_options = c("striped", "hold_position"),
                  position = "center", full_width = FALSE) %>%
    add_header_above(c(" " = 1, "New Arrest" = 2, "New Case" = 2)) %>%
    add_header_above(c(" " = 1, "Dependent Variable" = 4))
  
  return(tab)
}


## Get Robustness Regs Tab
# specify independent variable ("cw_fta_athearing" or "disco_incomplete")
# option to use adjusted z-scores
getRobustnessRegsTab <- function(df, use_adj = FALSE, iv_prefix) {
  
  # Choose suffix based on 'use_adj' flag
  suffix <- if (use_adj) ".adj_z" else "_z_norm"
  
  # Compose formulas using the chosen independent variable
  lm_length <- lm(as.formula(paste0("case_length_days", suffix, " ~ ", iv_prefix, suffix)), data = df)
  lm_conv <- lm(as.formula(paste0("conviction", suffix, " ~ ", iv_prefix, suffix)), data = df)
  lm_pun <- lm(as.formula(paste0("punished", suffix, " ~ ", iv_prefix, suffix)), data = df)
  
  tab <- modelsummary(list("Case Length" = lm_length,
                           "Conviction" = lm_conv,
                           "Formal Punishment" = lm_pun),
                      stars = TRUE,
                      title = "Association between ADA Hearing and Case Outcomes",
                      gof_map = c("nobs", "r.squared"),
                      output = "latex",
                      booktabs = TRUE) %>%
    kable_styling(latex_options = c("striped", "hold_position"),
                  position = "center", full_width = FALSE) %>%
    add_header_above(c(" " = 1, "Dependent Variable" = 3))
  
  return(tab)
}


# Make all tables
tab1 <- getDescriptiveStatsTable(df_team)

# Randomly sample an ADA with high convictions and low rearrests to use as an example
eligible_adas <- subset(results_team, conviction_z_norm > 0 & rearrest_win_2yr_fromOGopen_z_norm < 0 & ks < 0.05)
sampled_ada <- eligible_adas[sample(nrow(eligible_adas), 1), ]$ADA

tab2 <- getADARawOutcomesTable(df_team, results_team, sampled_ada)
tab3 <- getCaseloadBalanceTable(df_team, results_team, sampled_ada)

tabA2 <- getSampleADAZScores(results_team, sampled_ada)
tabA3 <- getHCRegsTab(results_team, use_adj = FALSE)
tabA4 <- getCDRegsTab(results_team, use_adj = FALSE)
tabA5 <- getHCRegsTab(results_team, use_adj = TRUE)
tabA6 <- getCDRegsTab(results_team, use_adj = TRUE)

# get robustness regressions, put together into panel table in LaTeX
tabA7.1 <- getRobustnessRegsTab(results_team, use_adj = FALSE, iv_prefix = "cw_fta_athearing")
tabA7.2 <- getRobustnessRegsTab(results_robust, use_adj = FALSE, iv_prefix = "cw_fta_athearing")
tabA7.3 <- getRobustnessRegsTab(results_team, use_adj = FALSE, iv_prefix = "disco_incomplete")
tabA7.4 <- getRobustnessRegsTab(results_robust, use_adj = FALSE, iv_prefix = "disco_incomplete")

# get robustness regressions, put together into panel table in LaTeX
tabA8.1 <- getCDRegsTab(results_team, use_adj = FALSE)
tabA8.2 <- getCDRegsTab(results_robust, use_adj = FALSE)

# What type of crimes were rearrests for?
recid <- getRecidDetails(df_team)

recid %>%
  summarize(count = n(), 
            .by = crimebigcat) %>%
  mutate(total = sum(count)) %>%
  mutate(percent = count / total)


#-------------------------------------------------------------------------------#
# OVERALL BALANCE TESTS --------------------------------------------------------#
#-------------------------------------------------------------------------------#

#-------------------------------------------------------------------------------#
# Balance on things we controlled for ------------------------------------------#
#-------------------------------------------------------------------------------#

# Make a set of balance tables for things we controlled for:
# male, prior arrest, violent, DUI, M lead charge, number of charges

getBalanceResultsControlVars <- function(df_team){
  
  setwd("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_hearing_team/")
  filenames <- list.files()
  results <- list()
  
  for(file in filenames){
    
    a <- readRDS(file) 
    
    if(a$ks <= 0.06){
      
      # recover ADA's name
      name <- gsub(".rds","",file) %>%
        gsub("_"," ", .)
      
      # get dataset used in benchmarking for ADA a
      temp <- df_team %>%
        # label ADA i's cases as "treat"
        mutate(treat = ifelse(ada_name == name, 1, 0)) %>%
        # drop rows for teammates who handled the same case (didn't count towards benchmark)
        mutate(teammate = ifelse(any(treat == 1), 1, 0), .by = docket_number) %>%
        mutate(drop = ifelse(treat == 0 & teammate == 1, 1, 0)) %>%
        filter(drop == 0) %>%
        mutate(lead_M1 = ifelse(lead_charge_grade_atCU_levels == "M1", 1, 0))
      
      # get unweighted controls
      tab.un.ctrl <- temp %>%
        filter(treat == 0) %>%
        summarize(p_priors = mean(priorarrest_past_year, na.rm = T),
                  p_violent = mean(violent, na.rm = T),
                  p_dui = mean(dui, na.rm = T),
                  p_mlead = mean(lead_M1, na.rm = T),
                  m_ncharges = mean(number_of_charges, na.rm = T),
                  p_black = mean(black, na.rm = T)) %>%
        mutate(type = "un_ctrl") %>%
        mutate(ada = name) %>%
        select(ada, type, p_priors, p_violent, p_dui, p_mlead, m_ncharges, p_black)
      
      # get treated group by pulling from fastDR balance tab
      tab.treat <- tibble(
        ada         = name,
        type        = "treat",
        p_priors    = a$balance.tab["prior_past_yr", "treatment"],
        p_violent   = a$balance.tab["violent", "treatment"],
        p_dui       = a$balance.tab["dui", "treatment"],
        p_mlead     = a$balance.tab["lead_charge_grade_atCU_levels:M1", "treatment"],
        m_ncharges  = a$balance.tab["number_of_charges", "treatment"],
        p_black     = a$balance.tab["black", "treatment"]
      )
      
      # get weighted controls by pulling from fastDR balance tab
      tab.w.ctrl <- tibble(
        ada         = name,
        type        = "w_ctrl",
        p_priors    = a$balance.tab["prior_past_yr", "control"],
        p_violent   = a$balance.tab["violent", "control"],
        p_dui       = a$balance.tab["dui", "control"],
        p_mlead     = a$balance.tab["lead_charge_grade_atCU_levels:M1", "control"],
        m_ncharges  = a$balance.tab["number_of_charges", "control"],
        p_black     = a$balance.tab["black", "control"]
      )
      
      tab <- rbind(tab.treat, tab.un.ctrl, tab.w.ctrl)
      
      # append to main DF of results
      results <- append(results, list(tab))
      
      print(paste0("finished ADA ",name))
    }
    
    else {
    message(paste0("skipped ", file, " (ks = ", a$ks, ")"))
    }
    
  }
  
  balance_results <- bind_rows(results)
  
  return(balance_results)
}

balance_results <- getBalanceResultsControlVars(df_team) %>%
  distinct(.) %>%
  pivot_wider(names_from = "type",
              values_from = c("p_priors", "p_violent", "p_dui", "p_mlead", "m_ncharges", "p_black")) %>%
  mutate(
    raw_diff_priors    = p_priors_treat    - p_priors_un_ctrl,
    bench_diff_priors  = p_priors_treat    - p_priors_w_ctrl,
    raw_diff_violent   = p_violent_treat   - p_violent_un_ctrl,
    bench_diff_violent = p_violent_treat   - p_violent_w_ctrl,
    raw_diff_dui       = p_dui_treat       - p_dui_un_ctrl,
    bench_diff_dui     = p_dui_treat       - p_dui_w_ctrl,
    raw_diff_mlead     = p_mlead_treat     - p_mlead_un_ctrl,
    bench_diff_mlead   = p_mlead_treat     - p_mlead_w_ctrl,
    raw_diff_ncharges  = m_ncharges_treat  - m_ncharges_un_ctrl,
    bench_diff_ncharges= m_ncharges_treat  - m_ncharges_w_ctrl,
    raw_diff_black     = p_black_treat     - p_black_un_ctrl,
    bench_diff_black   = p_black_treat     - p_black_w_ctrl
  ) %>%
  pivot_longer(cols = starts_with("raw_diff_") | starts_with("bench_diff_"),
               names_to = "variable",
               values_to = "value") %>%
  select(ada, variable, value)


# Define custom plotting function
make_hist_plot <- function(data, var, label, show_legend = FALSE) {
  var_names <- c(paste0("raw_diff_", var), paste0("bench_diff_", var))
  
  p <- ggplot(data %>% filter(variable %in% var_names),
              aes(x = value, fill = variable)) +
    geom_histogram(alpha = 0.6, position = "identity", bins = 40) +
    labs(x = label, 
         y = "Count of MC Unit ADAs", 
         fill = "") +
    scale_fill_manual(
      values = setNames(c("darkblue", "red"), var_names),
      labels = setNames(c("Raw\nDifference", "Benchmarked\nDifference"), var_names)
    ) +
    theme_light(base_family = "Times New Roman") +
    theme(text = element_text(family = "Times New Roman"),
          legend.text = element_text(family = "Times New Roman"),
          axis.title = element_text(family = "Times New Roman"),
          axis.text = element_text(family = "Times New Roman"),
          legend.title = element_text(family = "Times New Roman"),
          plot.title = element_text(family = "Times New Roman"),
          strip.text = element_text(family = "Times New Roman"))
  
  if (!show_legend) {
    p <- p + theme(legend.position = "none")
  }
  
  return(p)
}

figA1.1 <- make_hist_plot(balance_results, "priors", "Difference in Proportion with Prior Arrest")
figA1.2 <- make_hist_plot(balance_results, "violent", "Difference in Proportion with Violent Charges")
figA1.3 <- make_hist_plot(balance_results, "dui", "Difference in Proportion with DUI Charges")
figA1.4 <- make_hist_plot(balance_results, "mlead", "Difference in Proportion with M1 Lead Charges", show_legend = TRUE)
figA1.5 <- make_hist_plot(balance_results, "ncharges", "Difference in Mean Number of Charges")
figA1.6 <- make_hist_plot(balance_results, "black", "Difference in Proportion of Black Defendants")

# Combine in a 3x2 grid
final_plot <- (figA1.1 | figA1.2 ) /
  (figA1.3 | figA1.4 ) /
  (figA1.5 | figA1.6)

# Export as one figure
ggsave("~/dev/dao-data/penn_research/what_makes_an_effective_prosecutor/exports/figA1.png", 
       final_plot, width = 9, height = 10, dpi = 300)


#-------------------------------------------------------------------------------#
# Balance on Unobservables -----------------------------------------------------#
#-------------------------------------------------------------------------------#

## Get Balance Results on Unobservables
# To show balance on other covariates, gets the following information by treated ADA,
# unweighted controls, and weighted controls (benchmark): 
# 1: up / down charging decisions at charging phase
# 2: arrest narrative length
# takes main df_team dataset, returns a dataframe of balance results for each ADA
getBalanceResultsUnobservables <- function(df_team){
  
  df_team <- df_team %>%
    getUpDownCharges(.) %>%
    getArrestNarrativeLength(.)
  
  setwd("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_hearing_team/")
  filenames <- list.files()
  results <- list()
  
  for(file in filenames){
    
    a <- readRDS(file) 
    
    if(a$ks <= 0.06){
      
      # recover ADA's name
      name <- gsub(".rds","",file) %>%
        gsub("_"," ", .)
      
      # get dataset used in benchmarking for ADA a
      temp <- df_team %>%
        # label ADA i's cases as "treat"
        mutate(treat = ifelse(ada_name == name, 1, 0)) %>%
        # drop rows for teammates who handled the same case (didn't count towards benchmark)
        mutate(teammate = ifelse(any(treat == 1), 1, 0), .by = docket_number) %>%
        mutate(drop = ifelse(treat == 0 & teammate == 1, 1, 0)) %>%
        filter(drop == 0) 
      
      tab.treat <- temp %>%
        filter(treat == 1) %>%
        summarize(p_upcharge = mean(upcharge, na.rm = T),
                  p_downcharge = mean(downcharge, na.rm = T),
                  chars = mean(char_count),
                  words = mean(word_count)) %>%
        mutate(type = "treat") %>%
        mutate(ada = name) %>%
        select(ada, type, p_upcharge, p_downcharge, chars, words)
      
      tab.un.ctrl <- temp %>%
        filter(treat == 0) %>%
        summarize(p_upcharge = mean(upcharge, na.rm = T),
                  p_downcharge = mean(downcharge, na.rm = T),
                  chars = mean(char_count),
                  words = mean(word_count)) %>%
        mutate(type = "un_ctrl") %>%
        mutate(ada = name) %>%
        select(ada, type, p_upcharge, p_downcharge, chars, words)
      
      # get weighted controls
      # pull out dataframe of benchmarking weights to calculate means for unobservables
      ww <- as.data.frame(a$w)
      ww$docket_listing_ada <- rownames(ww)
      rownames(ww) <- NULL
      names(ww) <- c("weight","docket_listing_ada")
      
      # get outcomes for weighted controls
      tab.w.ctrl <- temp %>%
        # merge in weights from benchmarking
        left_join(., ww) %>%
        filter(!is.na(weight)) %>%
        summarize(p_upcharge = weighted.mean(upcharge, weight, na.rm = T),
                  p_downcharge = weighted.mean(downcharge, weight, na.rm = T),
                  chars = weighted.mean(char_count, weight),
                  words = weighted.mean(word_count, weight)) %>%
        mutate(avg_age = a$balance.tab[[1]]) %>%
        mutate(p_male = a$balance.tab[[2]]) %>%
        mutate(type = "w_ctrl") %>%
        mutate(ada = name) %>%
        select(ada, type, p_upcharge, p_downcharge, chars, words)
      
      tab <- rbind(tab.treat, tab.un.ctrl, tab.w.ctrl)
      
      # append to main DF of results
      results <- append(results, list(tab))
    }
  }
  
  balance_results <- bind_rows(results)
  
  return(balance_results)
}

balance_results_uo <- getBalanceResultsUnobservables(df_team) %>%
  distinct(.) %>%
  pivot_wider(., 
              names_from = "type",
              values_from = c("p_upcharge", "p_downcharge","words","chars")) %>%
  mutate(raw_diff_upcharge = p_upcharge_treat - p_upcharge_un_ctrl,
         bench_diff_upcharge = p_upcharge_treat - p_upcharge_w_ctrl,
         raw_diff_downcharge = p_downcharge_treat - p_downcharge_un_ctrl,
         bench_diff_downcharge = p_downcharge_treat - p_downcharge_w_ctrl,
         raw_diff_words = words_treat - words_un_ctrl,
         bench_diff_words = words_treat - words_w_ctrl,
         raw_diff_chars = chars_treat - chars_un_ctrl,
         bench_diff_chars = chars_treat - chars_w_ctrl) %>%
  pivot_longer(cols = c(raw_diff_upcharge, bench_diff_upcharge, 
                        raw_diff_downcharge, bench_diff_downcharge,
                        raw_diff_words, bench_diff_words,
                        raw_diff_chars, bench_diff_chars), 
               names_to = "variable", 
               values_to = "value") %>%
  select(ada, variable, value)


figA2 <- ggplot(balance_results_uo %>% filter(variable %in% c("raw_diff_words", "bench_diff_words") & value < 800), # there is one very large value that skews plot- truncate and note in figure notes
                aes(x = value, fill = variable)) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 40) +
  labs(x = "Difference in Average Arrest Narrative Length (in Words) of Caseload", 
       y = "Count of MC Unit ADAs", 
       fill = "") +
  scale_fill_manual(values = c("raw_diff_words" = "darkblue", "bench_diff_words" = "red"),
                    labels = c("raw_diff_words" = "Raw\nDifference", 
                               "bench_diff_words" = "Benchmarked\nDifference")) +
  scale_x_continuous(breaks = seq(-200, 300, by = 100)) +
  theme_light(base_family = "Times New Roman") +
  theme(text = element_text(family = "Times New Roman"),
        legend.text = element_text(family = "Times New Roman"),
        axis.title = element_text(family = "Times New Roman"),
        axis.text = element_text(family = "Times New Roman"),
        legend.title = element_text(family = "Times New Roman"),
        plot.title = element_text(family = "Times New Roman"),
        strip.text = element_text(family = "Times New Roman"))

figA2
ggsave("~/dev/dao-data/penn_research/what_makes_an_effective_prosecutor/exports/figA2.png", 
       device = "png", width = 10, height = 4, units = c("in"), dpi = 300)


figA3.1 <- ggplot(balance_results_uo %>% filter(variable %in% c("raw_diff_upcharge", "bench_diff_upcharge")), 
                  aes(x = value, fill = variable)) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
  labs(x = "Difference in Percent of Caseload Upcharged by Prior ADA", 
       y = "Count of MC Unit ADAs", 
       fill = "") +
  scale_fill_manual(values = c("raw_diff_upcharge" = "darkblue", "bench_diff_upcharge" = "red"),
                    labels = c("raw_diff_upcharge" = "Raw\nDifference", 
                               "bench_diff_upcharge" = "Benchmarked\nDifference")) +
  scale_x_continuous(breaks = seq(floor(min(balance_results_uo$value, na.rm = TRUE)), 
                                  ceiling(max(balance_results_uo$value, na.rm = TRUE)), 
                                  by = 0.05)) +
  theme_light(base_family = "Times New Roman") +
  theme(text = element_text(family = "Times New Roman"),
        legend.text = element_text(family = "Times New Roman"),
        axis.title = element_text(family = "Times New Roman"),
        axis.text = element_text(family = "Times New Roman"),
        legend.title = element_text(family = "Times New Roman"),
        plot.title = element_text(family = "Times New Roman"),
        strip.text = element_text(family = "Times New Roman"))

figA3.1
ggsave("~/dev/dao-data/penn_research/what_makes_an_effective_prosecutor/exports/figA3.1.png", 
       device = "png", width = 10, height = 4, units = c("in"), dpi = 300)


figA3.2 <- ggplot(balance_results_uo %>% filter(variable %in% c("raw_diff_downcharge", "bench_diff_downcharge")), 
                  aes(x = value, fill = variable)) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
  labs(x = "Difference in Percent of Caseload Downcharged by Prior ADA", 
       y = "Count of MC Unit ADAs", 
       fill = "") +
  scale_fill_manual(values = c("raw_diff_downcharge" = "darkblue", "bench_diff_downcharge" = "red"),
                    labels = c("raw_diff_downcharge" = "Raw\nDifference", 
                               "bench_diff_downcharge" = "Benchmarked\nDifference")) +
  scale_x_continuous(breaks = seq(floor(min(balance_results_uo$value, na.rm = TRUE)), 
                                  ceiling(max(balance_results_uo$value, na.rm = TRUE)), 
                                  by = 0.05)) +
  theme_light(base_family = "Times New Roman") +
  theme(text = element_text(family = "Times New Roman"),
        legend.text = element_text(family = "Times New Roman"),
        axis.title = element_text(family = "Times New Roman"),
        axis.text = element_text(family = "Times New Roman"),
        legend.title = element_text(family = "Times New Roman"),
        plot.title = element_text(family = "Times New Roman"),
        strip.text = element_text(family = "Times New Roman"))

figA3.2
ggsave("~/dev/dao-data/penn_research/what_makes_an_effective_prosecutor/exports/figA3.2.png", 
       device = "png", width = 10, height = 4, units = c("in"), dpi = 300)

